{
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7-final"
  },
  "orig_nbformat": 2,
  "kernelspec": {
   "name": "Python 3.6.7 64-bit ('DataAnalysis': conda)",
   "display_name": "Python 3.6.7 64-bit ('DataAnalysis': conda)",
   "metadata": {
    "interpreter": {
     "hash": "07829f70181eff8a51ba0c91cec5a602c5da58bbdadabc96e22cfe4f56b0b078"
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2,
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "*** Introductory Examples for the NLTK Book ***\n",
      "Loading text1, ..., text9 and sent1, ..., sent9\n",
      "Type the name of the text or sentence to view it.\n",
      "Type: 'texts()' or 'sents()' to list the materials.\n",
      "text1: Moby Dick by Herman Melville 1851\n",
      "text2: Sense and Sensibility by Jane Austen 1811\n",
      "text3: The Book of Genesis\n",
      "text4: Inaugural Address Corpus\n",
      "text5: Chat Corpus\n",
      "text6: Monty Python and the Holy Grail\n",
      "text7: Wall Street Journal\n",
      "text8: Personals Corpus\n",
      "text9: The Man Who Was Thursday by G . K . Chesterton 1908\n"
     ]
    }
   ],
   "source": [
    "from tools import *\n",
    "from nltk.book import *\n",
    "%matplotlib inline"
   ]
  },
  {
   "source": [
    "# Chap1 语言处理与Python\n",
    "\n",
    "目的：\n",
    "\n",
    "1.  简单的程序如何与大规模的文本结合？\n",
    "2.  如何自动地提取出关键字和词组？如何使用它们来总结文本的风格和内容？\n",
    "3.  Python为文本处理提供了哪些工具和技术？\n",
    "4.  自然语言处理中还有哪些有趣的挑战？"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "## 1.4 回到 Python：决策与控制"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "source": [
    "### 1.4.1 条件"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "=============== >sent7< ===============\n['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']\n--------------- >[w for w in sent7 if len(w) < 4]< ---------------\n[',', '61', 'old', ',', 'the', 'as', 'a', '29', '.']\n--------------- >[w for w in sent7 if len(w) <= 4]< ---------------\n[',', '61', 'old', ',', 'will', 'join', 'the', 'as', 'a', 'Nov.', '29', '.']\n--------------- >[w for w in sent7 if len(w) == 4]< ---------------\n['will', 'join', 'Nov.']\n--------------- >[w for w in sent7 if len(w) != 4]< ---------------\n['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', '29', '.']\n"
     ]
    }
   ],
   "source": [
    "# 表1-3：数值比较运算符\n",
    "show_title(\"sent7\")\n",
    "print(sent7)\n",
    "show_subtitle(\"[w for w in sent7 if len(w) < 4]\")\n",
    "print([w for w in sent7 if len(w) < 4])\n",
    "show_subtitle(\"[w for w in sent7 if len(w) <= 4]\")\n",
    "print([w for w in sent7 if len(w) <= 4])\n",
    "show_subtitle(\"[w for w in sent7 if len(w) == 4]\")\n",
    "print([w for w in sent7 if len(w) == 4])\n",
    "show_subtitle(\"[w for w in sent7 if len(w) != 4]\")\n",
    "print([w for w in sent7 if len(w) != 4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >单词以'ableness'结尾< ---------------\n",
      "['comfortableness', 'honourableness', 'immutableness', 'indispensableness', 'indomitableness', 'intolerableness', 'palpableness', 'reasonableness', 'uncomfortableness']\n",
      "--------------- >单词中包含'gnt'< ---------------\n",
      "['Sovereignty', 'sovereignties', 'sovereignty']\n",
      "--------------- >首字母大写< ---------------\n",
      "['A', 'Aaaaaaaaah', 'Aaaaaaaah', 'Aaaaaah', 'Aaaah', 'Aaaaugh', 'Aaagh', 'Aaah', 'Aaauggh', 'Aaaugh', 'Aaauugh', 'Aagh', 'Aah', 'Aauuggghhh', 'Aauuugh', 'Aauuuuugh', 'Aauuuves', 'Action', 'Actually', 'African', 'Ages', 'Aggh', 'Agh', 'Ah', 'Ahh', 'Alice', 'All', 'Allo', 'Almighty', 'Alright', 'Am', 'Amen', 'An', 'Anarcho', 'And', 'Angnor', 'Anthrax', 'Antioch', 'Anybody', 'Anyway', 'Apples', 'Aramaic', 'Are', 'Arimathea', 'Armaments', 'Arthur', 'As', 'Ask', 'Assyria', 'At', 'Attila', 'Augh', 'Autumn', 'Auuuuuuuugh', 'Away', 'Ay', 'Ayy', 'B', 'Back', 'Bad', 'Badon', 'Battle', 'Be', 'Beast', 'Bedevere', 'Bedwere', 'Behold', 'Between', 'Beyond', 'Black', 'Bloody', 'Blue', 'Bon', 'Bones', 'Book', 'Bors', 'Brave', 'Bravely', 'Bravest', 'Bread', 'Bridge', 'Bring', 'Bristol', 'Britain', 'Britons', 'Brother', 'Build', 'Burn', 'But', 'By', 'C', 'Caerbannog', 'Camaaaaaargue', 'Camelot', 'Castle', 'Chapter', 'Charge', 'Chaste', 'Cherries', 'Chicken', 'Chickennn', 'Chop', 'Christ', 'Churches', 'Cider', 'Clark', 'Clear', 'Come', 'Concorde', 'Consult', 'Cornwall', 'Could', 'Course', 'Court', 'Crapper', 'Cut', 'Dappy', 'Death', 'Defeat', 'Dennis', 'Did', 'Didn', 'Dingo', 'Dis', 'Divine', 'Do', 'Doctor', 'Does', 'Don', 'Dragon', 'Dramatically', 'Ecky', 'Ector', 'Eee', 'Eh', 'Enchanter', 'England', 'English', 'Erbert', 'Ere', 'Erm', 'Eternal', 'European', 'Even', 'Every', 'Everything', 'Ewing', 'Exactly', 'Excalibur', 'Excuse', 'Explain', 'Far', 'Farewell', 'Father', 'Fetchez', 'Fiends', 'Fine', 'First', 'Firstly', 'Five', 'Follow', 'For', 'Forgive', 'Forward', 'Found', 'Four', 'France', 'Frank', 'French', 'Gable', 'Galahad', 'Gallahad', 'Gawain', 'Get', 'Go', 'God', 'Good', 'Gorge', 'Grail', 'Great', 'Greetings', 'Grenade', 'Guards', 'Guy', 'Ha', 'Hah', 'Hallo', 'Halt', 'Hand', 'Hang', 'Have', 'Haw', 'He', 'Hee', 'Heee', 'Heh', 'Hello', 'Help', 'Herbert', 'Here', 'Hey', 'Hic', 'Hill', 'Himself', 'His', 'Hiyaah', 'Hiyah', 'Hiyya', 'Hm', 'Hmm', 'Ho', 'Hoa', 'Hold', 'Holy', 'Honestly', 'Hoo', 'Hooray', 'How', 'Huh', 'Hurry', 'Huy', 'Huyah', 'Hya', 'Hyy', 'I', 'Idiom', 'Iesu', 'If', 'Iiiiives', 'Iiiives', 'In', 'Is', 'Isn', 'It', 'Ives', 'Jesus', 'Joseph', 'Just', 'Keep', 'King', 'Knight', 'Knights', 'Lady', 'Lake', 'Lancelot', 'Launcelot', 'Lead', 'Leaving', 'Let', 'Lie', 'Like', 'Listen', 'Loimbard', 'Look', 'Looks', 'Lord', 'Lucky', 'Make', 'Man', 'May', 'Maynard', 'Meanwhile', 'Mercea', 'Message', 'Midget', 'Mind', 'Mine', 'Mmm', 'Monsieur', 'More', 'Morning', 'Most', 'Mother', 'Mud', 'Must', 'My', 'N', 'Nador', 'Nay', 'Neee', 'Never', 'Ni', 'Nine', 'Ninepence', 'No', 'None', 'Not', 'Nothing', 'Now', 'Nu', 'O', 'Of', 'Off', 'Oh', 'Ohh', 'Old', 'Olfin', 'On', 'Once', 'One', 'Ooh', 'Oooh', 'Oooo', 'Oooohoohohooo', 'Oooooooh', 'Open', 'Or', 'Order', 'Other', 'Oui', 'Our', 'Over', 'Ow', 'Packing', 'Patsy', 'Pendragon', 'Peng', 'Perhaps', 'Peril', 'Picture', 'Pie', 'Piglet', 'Pin', 'Please', 'Practice', 'Prepare', 'Prince', 'Princess', 'Providence', 'Psalms', 'Pull', 'Pure', 'Put', 'Quick', 'Quickly', 'Quiet', 'Quite', 'Quoi', 'Rather', 'Really', 'Recently', 'Remove', 'Rheged', 'Ridden', 'Right', 'Riiight', 'Robin', 'Robinson', 'Roger', 'Round', 'Run', 'Running', 'S', 'Said', 'Saint', 'Saxons', 'Say', 'Schools', 'See', 'Seek', 'Shall', 'She', 'Shh', 'Shrubber', 'Shrubberies', 'Shut', 'Silence', 'Silly', 'Since', 'Sir', 'Skip', 'So', 'Sorry', 'Speak', 'Splendid', 'Spring', 'Stand', 'Stay', 'Steady', 'Stop', 'Summer', 'Supposing', 'Supreme', 'Surely', 'Swamp', 'Table', 'Tale', 'Tall', 'Tell', 'Thank', 'That', 'The', 'Thee', 'Then', 'There', 'Therefore', 'They', 'This', 'Those', 'Thou', 'Thpppppt', 'Thppppt', 'Thpppt', 'Thppt', 'Three', 'Throw', 'Thsss', 'Thursday', 'Thy', 'Til', 'Tim', 'Tis', 'To', 'Today', 'Together', 'Too', 'Torment', 'Tower', 'True', 'Try', 'Twenty', 'Two', 'U', 'Uh', 'Uhh', 'Ulk', 'Um', 'Umhm', 'Umm', 'Un', 'Unfortunately', 'Until', 'Use', 'Uther', 'Uugh', 'Uuh', 'Very', 'Victory', 'W', 'Waa', 'Wait', 'Walk', 'Wayy', 'We', 'Welcome', 'Well', 'What', 'When', 'Where', 'Which', 'Who', 'Whoa', 'Why', 'Will', 'Winston', 'Winter', 'With', 'Woa', 'Wood', 'Would', 'Y', 'Yapping', 'Yay', 'Yeaaah', 'Yeaah', 'Yeah', 'Yes', 'You', 'Your', 'Yup', 'Zoot']\n",
      "--------------- >单词是数字< ---------------\n",
      "['29', '61']\n"
     ]
    }
   ],
   "source": [
    "# 表1-4：词汇比较运算符\n",
    "show_subtitle(\"单词以'ableness'结尾\")\n",
    "print(sorted([w for w in set(text1) if w.endswith('ableness')]))\n",
    "show_subtitle(\"单词中包含'gnt'\")\n",
    "print(sorted([term for term in set(text4) if 'gnt' in term]))\n",
    "show_subtitle(\"首字母大写\")\n",
    "print(sorted([item for item in set(text6) if item.istitle()])) \n",
    "show_subtitle(\"单词是数字\")\n",
    "print(sorted([item for item in set(sent7) if item.isdigit()]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "--------------- >单词中包含 '-' 和 'index'< ---------------\n",
      "['Stock-index', 'index-arbitrage', 'index-fund', 'index-options', 'index-related', 'stock-index']\n",
      "--------------- >单词首字母大写，并且单词长度超过10< ---------------\n",
      "['Abelmizraim', 'Allonbachuth', 'Beerlahairoi', 'Canaanitish', 'Chedorlaomer', 'Girgashites', 'Hazarmaveth', 'Hazezontamar', 'Ishmeelites', 'Jegarsahadutha', 'Jehovahjireh', 'Kirjatharba', 'Melchizedek', 'Mesopotamia', 'Peradventure', 'Philistines', 'Zaphnathpaaneah']\n",
      "--------------- >单词首字母不是小写< ---------------\n",
      "[',', '.', '29', '61', 'Nov.', 'Pierre', 'Vinken']\n",
      "--------------- >单词中含有 'cie' 或者 'cei'< ---------------\n",
      "['ancient', 'ceiling', 'conceit', 'conceited', 'conceive', 'conscience', 'conscientious', 'conscientiously', 'deceitful', 'deceive', 'deceived', 'deceiving', 'deficiencies', 'deficiency', 'deficient', 'delicacies', 'excellencies', 'fancied', 'insufficiency', 'insufficient', 'legacies', 'perceive', 'perceived', 'perceiving', 'prescience', 'prophecies', 'receipt', 'receive', 'received', 'receiving', 'society', 'species', 'sufficient', 'sufficiently', 'undeceive', 'undeceiving']\n"
     ]
    }
   ],
   "source": [
    "show_subtitle(\"单词中包含 '-' 和 'index'\")\n",
    "print(sorted([w for w in set(text7) if '-' in w and 'index' in w]))\n",
    "show_subtitle(\"单词首字母大写，并且单词长度超过10\")\n",
    "print(sorted([wd for wd in set(text3) if wd.istitle() and len(wd) > 10]))\n",
    "show_subtitle(\"单词首字母不是小写\")\n",
    "print(sorted([w for w in set(sent7) if not w.islower()]))\n",
    "show_subtitle(\"单词中含有 'cie' 或者 'cei'\")\n",
    "print(sorted([t for t in set(text2) if 'cie' in t or 'cei' in t]))"
   ]
  },
  {
   "source": [
    "### 1.4.2 操作每个元素"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "tags": []
   },
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "text1中每个单词的长度=  [1, 4, 4, 2, 6, 8, 4, 1, 9, 1, 1, 8, 2, 1, 4, 11, 5, 2, 1, 7, 6, 1, 3, 4, 5, 2, 10, 2, 4, 1, 5, 1, 4, 1, 3, 5, 1, 1, 3, 3, 3, 1, 2, 3, 4, 7, 3, 3, 8, 3]\ntext1中每个单词都大写=  ['[', 'MOBY', 'DICK', 'BY', 'HERMAN', 'MELVILLE', '1851', ']', 'ETYMOLOGY', '.', '(', 'SUPPLIED', 'BY', 'A', 'LATE', 'CONSUMPTIVE', 'USHER', 'TO', 'A', 'GRAMMAR', 'SCHOOL', ')', 'THE', 'PALE', 'USHER', '--', 'THREADBARE', 'IN', 'COAT', ',', 'HEART', ',', 'BODY', ',', 'AND', 'BRAIN', ';', 'I', 'SEE', 'HIM', 'NOW', '.', 'HE', 'WAS', 'EVER', 'DUSTING', 'HIS', 'OLD', 'LEXICONS', 'AND']\n"
     ]
    }
   ],
   "source": [
    "print(\"text1中每个单词的长度= \",[len(w) for w in text1][0:50])\n",
    "print(\"text1中每个单词都大写= \",[w.upper() for w in text1][0:50])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "len(text1)=  260819\n",
      "len(set(text1))=  19317\n",
      "len([word.lower() for word in set(text1)])=  19317\n",
      "len(set([word.lower() for word in text1]))=  17231\n",
      "len(set([word.lower() for word in text1 if word.isalpha()]))=  16948\n"
     ]
    }
   ],
   "source": [
    "# 进一步理解 Python 的 「链表推导」\n",
    "# set()：去除重复的单词\n",
    "print(\"len(text1)= \", len(text1))\n",
    "print(\"len(set(text1))= \", len(set(text1)))\n",
    "# lower()：小写单词\n",
    "print(\"len([word.lower() for word in set(text1)])= \", len([word.lower() for word in set(text1)]))\n",
    "print(\"len(set([word.lower() for word in text1]))= \", len(set([word.lower() for word in text1])))\n",
    "# word.isalpha()：如果 word 没有字母就返回 False\n",
    "print(\"len(set([word.lower() for word in text1 if word.isalpha()]))= \",\n",
    "      len(set([word.lower() for word in text1 if word.isalpha()])))"
   ]
  },
  {
   "source": [
    "### 1.4.3 嵌套代码块"
   ],
   "cell_type": "markdown",
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "单词(cat)长度小于 5\n"
     ]
    }
   ],
   "source": [
    "# 控制结构\n",
    "word = 'cat'\n",
    "\n",
    "if len(word) < 5:\n",
    "    print(f\"单词({word})长度小于 5\")\n",
    "\n",
    "if len(word) >= 5:\n",
    "    print(f\"单词({word})长度大于等于 5\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Call\nme\nIshmael\n.\n"
     ]
    }
   ],
   "source": [
    "# 循环结构\n",
    "for word in ['Call', 'me', 'Ishmael', '.']:\n",
    "    print(word)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Call\nIshmael\n"
     ]
    }
   ],
   "source": [
    "# 条件循环\n",
    "sent1 = ['Call', 'me', 'Ishmael', '.']\n",
    "for xyzzy in sent1:\n",
    "    if xyzzy.endswith('l'):\n",
    "        print(xyzzy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "Call is a title case word\nme is a lower case word\nIshmael is a title case word\n. is punctuation\n"
     ]
    }
   ],
   "source": [
    "# 更加复杂的条件控制\n",
    "for token in sent1:\n",
    "    if token.islower():\n",
    "        print(token, 'is a lower case word')\n",
    "    elif token.istitle():\n",
    "        print(token, 'is a title case word')\n",
    "    else:\n",
    "        print(token, \"is punctuation\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "output_type": "stream",
     "name": "stdout",
     "text": [
      "ancient ceiling conceit conceited conceive conscience conscientious conscientiously deceitful deceive deceived deceiving deficiencies deficiency deficient delicacies excellencies fancied insufficiency insufficient legacies perceive perceived perceiving prescience prophecies receipt receive received receiving society species sufficient sufficiently undeceive undeceiving "
     ]
    }
   ],
   "source": [
    "# 习惯用法组合\n",
    "tricky = sorted(w for w in set(text2) if 'cie' in w or 'cei' in w)\n",
    "for word in tricky:\n",
    "    print(word, end=' ')"
   ]
  }
 ]
}